

import re
import requests
import csv


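# Scrape the Douban Top 250 movie list:
#   1. Request each Top 250 page and read its HTML source.
#   2. Extract the wanted fields with a regular expression.
#   3. Write the results to a CSV file (a plain-text tabular format that
#      spreadsheet tools such as Excel can open).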

# The listing is paged through a `start` offset: 0, 25, 50, ..., 225.
PAGE_SIZE = 25
TOTAL_MOVIES = 250
OUTPUT_PATH = "movie_top250.csv"
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 10

# Browser-like User-Agent sent with every request.
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
}

# Compiled once (it is identical for every page). re.S lets `.` span newlines,
# which is required because each movie entry covers many lines of HTML.
# Named groups: movie_name, year, score.
MOVIE_PATTERN = re.compile(
    r'<li>.*?<div class="item">.*?<div class="info">.*? <span class="title">(?P<movie_name>.*?)</span>'
    r'.*?<p class="">.*?<br>(?P<year>.*?)&nbsp.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>',
    re.S,
)


def parse_movies(html):
    """Extract the movie entries from the HTML source of one listing page.

    Args:
        html: Page source as a string.

    Returns:
        A list of ``(movie_name, year, score)`` string tuples in document
        order. ``year`` has surrounding whitespace stripped. The list is
        empty when nothing in ``html`` matches the pattern.
    """
    return [
        (m.group("movie_name"), m.group("year").strip(), m.group("score"))
        for m in MOVIE_PATTERN.finditer(html)
    ]


def fetch_page(start):
    """Download one listing page and return its HTML source.

    Args:
        start: Offset of the first movie on the page (0, 25, 50, ...).

    Returns:
        The response body decoded as text.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status.
    """
    url = f'https://movie.douban.com/top250?start={start}&filter='
    resp = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of silently parsing an error page
    # into zero rows.
    resp.raise_for_status()
    return resp.text


def main():
    """Fetch every listing page, print each movie and append it to the CSV."""
    # Open the file once for the whole run and let `with` close it.
    # mode="a" appends to any existing file; newline="" is what the csv module
    # expects so that no blank line is inserted after each row on Windows.
    with open(OUTPUT_PATH, mode="a", encoding="utf-8", newline="") as fo:
        writer = csv.writer(fo)
        for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
            for row in parse_movies(fetch_page(start)):
                print(*row, sep="   ")
                writer.writerow(row)


if __name__ == "__main__":
    main()
